Transformative Change Assessment - Ch 3 - Clusters

Back to Readme

Waiting for new key-papers

Setup

Show the code
#|
if (!exists("params")) {
    params <- rmarkdown::yaml_front_matter("snowball.qmd")$params
}

knitr::opts_chunk$set(message = NA)

library(IPBES.R)
library(openalexR)
Thank you for using openalexR!
To acknowledge our work, please cite the package by calling `citation("openalexR")`.
To suppress this message, add `openalexR.message = suppressed` to your .Renviron file.
Show the code
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
Show the code
library(readxl)

library(tibble)
# library(dplyr)
# library(knitr)
library(networkD3)
library(tictoc)

library(patchwork)

# devtools::install_github("mattflor/chorddiag")
library(chorddiag)

dir.create(params$gdm_path, showWarnings = FALSE)

fns <- sapply(
    1:6,
    function(i) {
        gsub(
            pattern = "X",
            replacement = i,
            x = file.path("input", "key_papers", params$key_paper)
        )
    }
)

###########
## Update `cX_key-paper.csv` from `TCA_Ch3_DOIs_Dataset_Theories_3_May_2024.xlsx`
## This needs to be run manually

if (FALSE) {
    all <- readxl::read_excel(file.path("input", "key_papers", "TCA_Ch3_DOIs_Dataset_Theories_3_May_2024.xlsx"))

    cluster <- names(all)[11:16]

    sapply(
        1:6,
        function(i) {
            all |>
                dplyr::filter(.data[[cluster[[i]]]] == 1) |>
                dplyr::select("DOI", "Authors", "Year", "Title") |>
                dplyr::filter(!is.na(DOI)) |>
                dplyr::mutate(
                    DOI = gsub(
                        pattern = "https://doi.org/",
                        replacement = "",
                        x = DOI
                    )
                ) |>
                write.csv(
                    file = fns[i],
                    row.names = FALSE
                )
        }
    ) |>
        invisible()

    rm(all, cluster)
}

##
###########

cluster <- lapply(
    fns,
    function(fn) {
        list(
            timestamp = Sys.time(),
            fn = fn,
            dois = read.csv(fn)$DOI
        )
    }
)

names(cluster) <- paste0("cluster_", 1:6)

Verification of key papers

The key papers should describe the cluster, exactly one cluster, and not multiple clusters. In other words, a key paper should only be in one cluster. If a key-papers is in multiple clusters, this would lead to an overlap wof the snowball corpus which is not caused by the cluster itself, but by the key paper which is in multiple clusters.

The papers are the following (the table will be empty if there are none):

These papers will be excluded from the analysis.

Show the code
for (i in grep("cluster", names(cluster), value = TRUE)) {
    cluster[[i]]$dois <- cluster[[i]]$dois[!cluster[[i]]$dois %in% dupl_kp]
}

Do Searches

Get Generell key-works and Snowball

Show the code
#|

fn <- file.path("data", "key_works.rds")
if (exists(fn)) {
    key_works <- readRDS(fn)
} else {
    dois <- read.csv(file.path("input", "key_papers", "key-paper.csv"))$DOI
    key_works <- oa_fetch(
        entity = "works",
        doi = dois,
        verbose = FALSE
    )
    saveRDS(key_works, fn)
}

ids <- openalexR:::shorten_oaid(key_works$id)

fn <- file.path("data", "snowball.rds")
if (file.exists(fn)) {
    snowball <- readRDS(fn)
} else {
    snowball <- oa_snowball(
        identifier = ids,
        verbose = FALSE
    )
    saveRDS(snowball, fn)
}

flat_snow <- snowball2df(snowball) |>
    tibble::as_tibble()

Get Cluster key-works and Snowballs

Show the code
#|
tic()
fn <- file.path("data", "cluster.rds")
if (file.exists(fn)) {
    cluster <- readRDS(fn)
} else {
    cluster <- lapply(
        cluster,
        function(cl) {
            message("Processing ", cl$fn, " ...")
            message("|- Fetching works ...")
            cl$key_works <- oa_fetch(
                entity = "works",
                doi = cl$dois,
                verbose = FALSE
            )
            cl$key_works$id <- gsub(
                pattern = "https://openalex.org/",
                replacement = "",
                cl$key_works$id
            )
            message("|- Snowballing works ...")
            cl$snowball_1 <- oa_snowball(
                identifier = openalexR:::shorten_oaid(cl$key_works$id),
                verbose = FALSE
            )
            message("|- Converting to tibble ...")
            cl$snowball_1_df <- snowball2df(cl$snowball_1) |>
                as_tibble()
            message("\n")
            return(cl)
        }
    )

    ## Detemine linkages between Clusters

    nodes <- data.frame(
        id = names(cluster),
        n = sapply(
            cluster,
            function(cl) {
                nrow(cl$snowball_1$nodes)
            }
        ),
        n_kp = sapply(
            cluster,
            function(cl) {
                nrow(cl$key_works)
            }
        ),
        stringsAsFactors = FALSE
    )

    all_kps <- lapply(
        cluster,
        function(cl) {
            cl$key_works$id
        }
    )

    all_ids <- lapply(
        cluster,
        function(cl) {
            cl$snowball_1$nodes$id
        }
    )

    nodes$only_l1 <- sapply(
        1:nrow(nodes),
        function(i) {
            id_from <- cluster[[nodes$id[i]]]$key_works$id
            nms <- grep(nodes$id[i], names(all_ids), value = TRUE, invert = TRUE)
            id_to <- all_kps[nms] |>
                unlist() |>
                unique()
            sum(!(id_from %in% id_to))
        }
    )

    nodes$only_l2 <- sapply(
        1:nrow(nodes),
        function(i) {
            id_from <- cluster[[nodes$id[i]]]$key_works$id
            nms <- grep(nodes$id[i], names(all_ids), value = TRUE, invert = TRUE)
            id_to <- all_ids[nms] |>
                unlist() |>
                unique()
            sum(!(id_from %in% id_to))
        }
    )

    nodes$only_l3 <- sapply(
        1:nrow(nodes),
        function(i) {
            id_from <- cluster[[nodes$id[i]]]$snowball_1$nodes$id
            nms <- grep(nodes$id[i], names(all_ids), value = TRUE, invert = TRUE)
            id_to <- all_ids[nms] |>
                unlist() |>
                unique()
            sum(!(id_from %in% id_to))
        }
    )

    edges <- expand.grid(
        names(cluster),
        names(cluster),
        stringsAsFactors = FALSE
    )
    names(edges) <- c("from", "to")
    edges <- edges[edges$from != edges$to, ]
    rownames(edges) <- NULL

    ## in_l1: keypaper c1 %in% keypaper c2
    edges$in_l1 <- sapply(
        1:nrow(edges),
        function(i) {
            cluster[[edges$from[i]]]$key_works$id %in% cluster[[edges$to[i]]]$key_works$id |>
                sum()
        }
    )

    ## in_l2: keypaper c1 %in% nodes c2
    edges$in_l2 <- sapply(
        1:nrow(edges),
        function(i) {
            cluster[[edges$from[i]]]$key_works$id %in% cluster[[edges$to[i]]]$snowball_1$nodes$id |>
                sum()
        }
    )

    ## in_l3: nodes c1 %in% nodes c2
    edges$in_l3 <- sapply(
        1:nrow(edges),
        function(i) {
            cluster[[edges$from[i]]]$snowball_1$nodes$id %in% cluster[[edges$to[i]]]$snowball_1$nodes$id |>
                sum()
        }
    )

    cluster$links <- list(
        nodes = nodes,
        edges = edges
    )

    saveRDS(cluster, fn)
}


key_works_cluster <- lapply(
    1:(length(cluster) - 1),
    function(i) {
        cluster[[i]]$key_works
    }
) |>
    do.call(what = rbind) |>
    dplyr::distinct(id, .keep_all = TRUE)

snowball_cluster <- list(
    nodes = lapply(
        1:(length(cluster) - 1),
        function(i) {
            cluster[[i]]$snowball_1$nodes
        }
    ) |>
        do.call(what = rbind) |>
        dplyr::distinct(id, .keep_all = TRUE),
    edges = lapply(
        1:(length(cluster) - 1),
        function(i) {
            cluster[[i]]$snowball_1$edges
        }
    ) |>
        do.call(what = rbind) |>
        dplyr::distinct()
)

toc()
65.289 sec elapsed

Create Graphs

Create Static Citation Network Graph of the S1 Corpus

Only S1 corpus of S1 can be graphed due to the size of the S2 corpus.

Show the code
#|

tic()

lapply(
    names(cluster),
    function(name) {
        plot_name <- paste0(name, "_S1")
        if (length(list.files("figures", pattern = paste0(plot_name, ".*(pdf|png)"))) < 4) {
            message("Plotting ", name, " ...")
            try(
                plot_snowball(
                    cluster[[name]]$snowball_1,
                    name = plot_name
                ),
                silent = FALSE
            )
        }
    }
)

toc()

Create Interactive Citation Network Graph of the S1 Corpus

Show the code
#|

tic()

lapply(
    names(cluster),
    function(name) {
        plot_name <- paste0(name, paste0(name, "_S1_interactive_network.html"))
        if (!file.exists(file.path("figures", plot_name))) {
            message("Plotting Interactive", name, " ...")
            nwg <- IPBES.R::plot_snowball_interactive(
                snowball = cluster[[name]]$snowball_1,
                key_works = cluster[[name]]$key_works,
                file = file.path("figures", plot_name)
            )
        }
    }
)

toc()

Create Maps of the S1 Corpus

Create Maps of the first author only

Show the code
#|

tic()

maps <- lapply(
    grep("cluster_", names(cluster), value = TRUE),
    function(name) {
        plot_name <- paste0("map_first_author_", name, "__S1")
        message("Plotting map ", name, " ...")
        map <- sapply(
            cluster[[name]]$snowball_1$nodes$author,
            function(authors) {
                if (is.null(names(authors))) {
                    return(NULL)
                } else {
                    authors[["institution_country_code"]][[1]]
                }
            }
        ) |>
            unlist() |>
            table(useNA = "no") |>
            as.data.frame() |>
            dplyr::rename(
                iso2c = Var1,
                n = Freq
            ) |>
            dplyr::mutate(
                log_n_1 = log(n) + 1
            ) |>
            IPBES.R::map_country_codes(
                values = "log_n_1",
                geodata_path = params$gdm_path
            ) +
            ggplot2::ggtitle(paste(name, "First Author Only"))

        if (length(list.files("maps", pattern = paste0(plot_name, ".*(pdf|png)"))) < 2) {
            ggplot2::ggsave(
                file.path("maps", paste0(plot_name, ".pdf")),
                map,
                width = 5,
                height = 2.5
            )
            ggplot2::ggsave(
                file.path("maps", paste0(plot_name, ".png")),
                map,
                width = 5,
                height = 2.5
            )
        }
        return(map)
    }
)
Warning: Some values were not matched unambiguously: XK
Warning in IPBES.R::map_country_codes(dplyr::mutate(dplyr::rename(as.data.frame(table(unlist(sapply(cluster[[name]]$snowball_1$nodes$author, : The following countries are not in the world dataset: 
MCO, MDV, NA
and will therefore not be plotted!
Warning: Some values were not matched unambiguously: XK
Warning in IPBES.R::map_country_codes(dplyr::mutate(dplyr::rename(as.data.frame(table(unlist(sapply(cluster[[name]]$snowball_1$nodes$author, : The following countries are not in the world dataset: 
MCO, NA
and will therefore not be plotted!
Warning: Some values were not matched unambiguously: XK
Warning in IPBES.R::map_country_codes(dplyr::mutate(dplyr::rename(as.data.frame(table(unlist(sapply(cluster[[name]]$snowball_1$nodes$author, : The following countries are not in the world dataset: 
NA
and will therefore not be plotted!
Warning: Some values were not matched unambiguously: XK
Warning in IPBES.R::map_country_codes(dplyr::mutate(dplyr::rename(as.data.frame(table(unlist(sapply(cluster[[name]]$snowball_1$nodes$author, : The following countries are not in the world dataset: 
MCO, NA
and will therefore not be plotted!
Show the code
plot_name <- "map_first_author__S1"
if (length(list.files("maps", pattern = paste0("^", plot_name, ".*(pdf|png)"))) < 2) {
    map <- patchwork::wrap_plots(
        plots = maps,
        ncol = 3
    )
    ggplot2::ggsave(
        file.path("maps", paste0(plot_name, ".pdf")),
        map,
        width = 15,
        height = 5
    )
    ggplot2::ggsave(
        file.path("maps", paste0(plot_name, ".png")),
        map,
        width = 15,
        height = 5
    )
}
rm(map, maps)
Warning in rm(map, maps): object 'map' not found
Show the code
toc()
8.398 sec elapsed

Create Maps of the all authors. All authors are weighted equal for now.

Show the code
#|

tic()

maps <- lapply(
    grep("cluster_", names(cluster), value = TRUE),
    function(name) {
        plot_name <- paste0("map_all_authors_", name, "__S1")
        message("Plotting map ", name, " ...")
        map <- sapply(
            cluster[[name]]$snowball_1$nodes$author,
            function(authors) {
                if (is.null(names(authors))) {
                    return(NULL)
                } else {
                    authors[["institution_country_code"]]
                }
            }
        ) |>
            unlist() |>
            table(useNA = "no") |>
            as.data.frame() |>
            dplyr::rename(
                iso2c = Var1,
                n = Freq
            ) |>
            dplyr::mutate(
                log_n_1 = log(n) + 1
            ) |>
            IPBES.R::map_country_codes(
                values = "log_n_1",
                geodata_path = params$gdm_path
            ) +
            ggplot2::ggtitle(paste(name, "All Authors"))

        if (length(list.files("maps", pattern = paste0(plot_name, ".*(pdf|png)"))) < 2) {
            ggplot2::ggsave(
                file.path("maps", paste0(plot_name, ".pdf")),
                map,
                width = 5,
                height = 2.5
            )
            ggplot2::ggsave(
                file.path("maps", paste0(plot_name, ".png")),
                map,
                width = 5,
                height = 2.5
            )
        }
        return(map)
    }
)
Warning: Some values were not matched unambiguously: XK
Warning in IPBES.R::map_country_codes(dplyr::mutate(dplyr::rename(as.data.frame(table(unlist(sapply(cluster[[name]]$snowball_1$nodes$author, : The following countries are not in the world dataset: 
MCO, MDV, NA
and will therefore not be plotted!
Warning: Some values were not matched unambiguously: XK
Warning in IPBES.R::map_country_codes(dplyr::mutate(dplyr::rename(as.data.frame(table(unlist(sapply(cluster[[name]]$snowball_1$nodes$author, : The following countries are not in the world dataset: 
BMU, GIB, MCO, MDV, SMR, TUV, NA
and will therefore not be plotted!
Warning: Some values were not matched unambiguously: XK
Warning in IPBES.R::map_country_codes(dplyr::mutate(dplyr::rename(as.data.frame(table(unlist(sapply(cluster[[name]]$snowball_1$nodes$author, : The following countries are not in the world dataset: 
MDV, NA
and will therefore not be plotted!
Warning: Some values were not matched unambiguously: XK
Warning in IPBES.R::map_country_codes(dplyr::mutate(dplyr::rename(as.data.frame(table(unlist(sapply(cluster[[name]]$snowball_1$nodes$author, : The following countries are not in the world dataset: 
MCO, NA
and will therefore not be plotted!
Show the code
plot_name <- "map_all_authors__S1"
if (length(list.files("maps", pattern = paste0("^", plot_name, ".*(pdf|png)"))) < 2) {
    map <- patchwork::wrap_plots(
        plots = maps,
        ncol = 3
    )
    ggplot2::ggsave(
        file.path("maps", paste0(plot_name, ".pdf")),
        map,
        width = 15,
        height = 5
    )
    ggplot2::ggsave(
        file.path("maps", paste0(plot_name, ".png")),
        map,
        width = 15,
        height = 5
    )
}
rm(map, maps)
Warning in rm(map, maps): object 'map' not found
Show the code
toc()
9.523 sec elapsed

Results

Graphs and Maps

The following interactions are possible in the interactive graphs:

  • moving your mouse over a node, the title author and year of the paper is shown.
  • clicking on a node will open the paper in a new tab.
  • scrolling up and down with your scroll wheel zooms in and out
  • clicking on the canvas and move the mouse will move the network
  • clicking on a node and dragging it moves the node

Overview Maps

First Author Only

All Authors no weighting

Cluster 1

Ach so.

Cluster 2

Cluster 3

Cluster 4

Cluster 5

!

Cluster 6

Common works between clusters

This section looks at common works between the keypapers of each cluster as well as the whole corpus as obtained through a first generation snowball search using the provided keyworks.

The links are classifiad using three levels:

  • in_l1: (keypaper in from) in (keypaper in to)
  • in_l2: (keypaper in from) in (snowball in to)
  • in_l3: (snowball in from) in (snowball in to)

Cluster properties

  • n: number of papers in the snowball corpus
  • n_kp: number of keypapers
Show the code
IPBES.R::table_dt(data = cluster$links$nodes[, -1])

Common works between Cluster and General corpus

Overlab between Cluster and General key-papers and corpi

Here I will show the overlap between the combined cluster corpi and the General corpus based on the six key paper.

Cluster General overlap
(keypaper <-> keypaper) 194 6 2
(keypaper <-> paper) 194 2130 41
(paper <-> keypaper) 156127 6 6
(paper <-> paper) 156127 2130 1860

(keypaper <-> keypaper)

Show the code
key_works_cluster[key_works_cluster$id %in% gsub("^https://openalex.org/", "", key_works$id), ] |>
    IPBES.R::table_dt(fixedColumns = NULL)

(keypaper <-> paper)

Show the code
key_works_cluster[key_works_cluster$id %in% snowball$nodes$id, ] |>
    IPBES.R::table_dt(fixedColumns = NULL)

(paper <-> keypaper)

Show the code
snowball_cluster$nodes[snowball_cluster$nodes$id %in% gsub("^https://openalex.org/", "", key_works$id), ] |>
    IPBES.R::table_dt(fixedColumns = NULL)

(paper <-> paper)

Show the code
snowball_cluster$nodes[snowball_cluster$nodes$id %in% snowball$nodes$id, ] |>
    IPBES.R::table_dt(fixedColumns = NULL)
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html